{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## Fetch only the NLTK resources this notebook uses.\n",
    "## Calling nltk.download() with no arguments opens the interactive downloader,\n",
    "## which stalls a headless or Run-All execution; naming the packages keeps this\n",
    "## cell non-interactive.\n",
    "import nltk\n",
    "\n",
    "# 'punkt' / 'punkt_tab' back sent_tokenize and word_tokenize (which of the two is\n",
    "# required depends on the installed NLTK version - TODO confirm for your setup);\n",
    "# 'stopwords' backs stopwords.words('english').\n",
    "for resource in ('punkt', 'punkt_tab', 'stopwords'):\n",
    "    nltk.download(resource)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import category_encoders as ce\n",
    "# Read the whole sample document into `text`. The with-block closes the file\n",
    "# even if read() raises, so the handle can never leak.\n",
    "with open('data/example_text.txt', 'rt') as text_file:\n",
    "    text = text_file.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Science sits at the core of any analytical exercise conducted on a Big Data or Internet of Things (IoT ) environment.\n"
     ]
    }
   ],
   "source": [
    "## Split the raw text into sentences ##\n",
    "from nltk.tokenize import sent_tokenize\n",
    "\n",
    "# `sentence` holds the full list of sentence strings; only the first is shown.\n",
    "sentence = sent_tokenize(text)\n",
    "print(sentence[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Data', 'Science', 'sits', 'at', 'the', 'core', 'of', 'any', 'analytical', 'exercise', 'conducted', 'on', 'a', 'Big', 'Data', 'or', 'Internet', 'of', 'Things', '(', 'IoT', ')', 'environment', '.', 'Data', 'science', 'involves', 'a', 'wide', 'array', 'of', 'technologies', ',', 'business', ',', 'and', 'machine', 'learning', 'algorithms', '.', 'The', 'purpose', 'of', 'data', 'science', 'is', 'just', 'not', 'doing', 'machine']\n"
     ]
    }
   ],
   "source": [
    "## Split the raw text into word-level tokens ##\n",
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "words = word_tokenize(text)\n",
    "# Preview the first 50 tokens instead of dumping the whole list.\n",
    "print(words[:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Data', 'Science', 'sits', 'at', 'the', 'core', 'of', 'any', 'analytical', 'exercise', 'conducted', 'on', 'a', 'Big', 'Data', 'or', 'Internet', 'of', 'Things', 'IoT', 'environment', 'Data', 'science', 'involves', 'a', 'wide', 'array', 'of', 'technologies', 'business', 'and', 'machine', 'learning', 'algorithms', 'The', 'purpose', 'of', 'data', 'science', 'is', 'just', 'not', 'doing', 'machine', 'learning', 'or', 'statistical', 'analysis', 'but', 'also']\n"
     ]
    }
   ],
   "source": [
    "# Keep only tokens made up entirely of letters (str.isalpha); punctuation\n",
    "# tokens and any token containing a digit or symbol are dropped.\n",
    "words_cleaned = list(filter(str.isalpha, words))\n",
    "print(words_cleaned[:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Data', 'Science', 'sits', 'core', 'analytical', 'exercise', 'conducted', 'Big', 'Data', 'Internet', 'Things', 'IoT', 'environment', 'Data', 'science', 'involves', 'wide', 'array', 'technologies', 'business', 'machine', 'learning', 'algorithms', 'The', 'purpose', 'data', 'science', 'machine', 'learning', 'statistical', 'analysis', 'also', 'derive', 'insights', 'data', 'user', 'statistics', 'knowledge', 'understand', 'In', 'fast', 'paced', 'environment', 'Big', 'Data', 'IoT', 'type', 'data', 'might', 'vary']\n"
     ]
    }
   ],
   "source": [
    "# Remove English stop words.\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "stop_words = set(stopwords.words('english'))\n",
    "# The NLTK English stop-word list is lowercase, so each token is lowercased for\n",
    "# the membership test; a case-sensitive test lets capitalised stop words such as\n",
    "# 'The' and 'In' slip through. Surviving tokens keep their original casing.\n",
    "words_1 = [word for word in words_cleaned if word.lower() not in stop_words]\n",
    "print(words_1[:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['data', 'science', 'sits', 'core', 'analytical', 'exercise', 'conducted', 'big', 'data', 'internet', 'things', 'iot', 'environment', 'data', 'science', 'involves', 'wide', 'array', 'technologies', 'business', 'machine', 'learning', 'algorithms', 'the', 'purpose', 'data', 'science', 'machine', 'learning', 'statistical', 'analysis', 'also', 'derive', 'insights', 'data', 'user', 'statistics', 'knowledge', 'understand', 'in', 'fast', 'paced', 'environment', 'big', 'data', 'iot', 'type', 'data', 'might', 'vary']\n"
     ]
    }
   ],
   "source": [
    "# Case folding: lowercase every remaining token.\n",
    "# The loop variable is named `word` so it does not shadow the `words_1` list\n",
    "# it iterates over.\n",
    "words_lower = [word.lower() for word in words_1]\n",
    "print(words_lower[:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['data', 'scienc', 'sit', 'core', 'analyt', 'exercis', 'conduct', 'big', 'data', 'internet', 'thing', 'iot', 'environ', 'data', 'scienc', 'involv', 'wide', 'array', 'technolog', 'busi', 'machin', 'learn', 'algorithm', 'the', 'purpos', 'data', 'scienc', 'machin', 'learn', 'statist', 'analysi', 'also', 'deriv', 'insight', 'data', 'user', 'statist', 'knowledg', 'understand', 'in', 'fast', 'pace', 'environ', 'big', 'data', 'iot', 'type', 'data', 'might', 'vari']\n"
     ]
    }
   ],
   "source": [
    "# Stemming: map each lowercased token to its Porter stem.\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "porter = PorterStemmer()\n",
    "stemmed_words = list(map(porter.stem, words_lower))\n",
    "# Preview the first 50 stems.\n",
    "print(stemmed_words[:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
